The dataset comes from R for Data Science’s Tidy Tuesday Project, which provides weekly releases of raw datasets for users to wrangle and analyze. The data itself originates from FiveThirtyEight, covers movies released from 1970 to 2013, and merges data from several sources.
# Load both TidyTuesday (2021-03-09) files straight from GitHub.
# raw_bechdel: the raw Bechdel scores file.
raw_bechdel = read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-09/raw_bechdel.csv"
)
# movies: the per-movie file that the cleaning pipeline starts from.
movies = read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-03-09/movies.csv"
)
The dataset was distributed in two files, movies.csv and raw_bechdel.csv.
movies.csv: A data file of 1794 observations and 34 variables, covering movies from 1970 to 2013. It contains variables for title, IMDB ID, year, Bechdel test (uncleaned, cleaned, binary pass/fail), genres, ratings, budget, domestic and international gross revenue, and 2013-adjusted budget and revenues.
raw_bechdel.csv: A data file of 8839 observations and 5 variables, covering movies from 1888 to 2021. It contains variables for title, movie ID, IMDB ID, and raw Bechdel test score (0 to 3).
The data cleaning steps involve the following:
- Converting domgross, intgross, domgross_2013, intgross_2013, and runtime to numeric variables
- Converting the binary character variable to a logical variable pass_bechdel that indicates whether the movie passes or fails the Bechdel Test
- Creating a decade_code variable from the release year and turning it into a decade factor variable according to the decade of release
- Creating an award_winner variable that is TRUE if the movie has won at least 1 Oscar, Golden Globe, or BAFTA award
- Separating the genre character variable into dummy-coded variables indicating whether the movie falls under one of 21 different genres
- Recoding the clean_test variable into 5 levels corresponding to the Bechdel Test criteria:
profit = intgross_2013 – budget_2013
ROI = profit / budget_2013
# Build the analysis dataset `movies_df` from the raw `movies` table:
# recode the Bechdel result, coerce revenue/runtime columns to numeric, derive
# decade, award, profit and ROI columns, split `genre` into per-genre logical
# flags, keep the analysis columns, and write the result to disk.
movies_df = movies %>%
  mutate(
    test = as.factor(test),
    # Human-readable labels for the cleaned Bechdel result ...
    clean_test = fct_recode(clean_test,
      "Less than 2 women" = "nowomen",
      "Don't talk to each other" = "notalk",
      "Only talk about men" = "men",
      "Dubious" = "dubious",
      "Passes Bechdel" = "ok"
    ),
    # ... with the levels put in this fixed order instead of alphabetical.
    clean_test = fct_relevel(clean_test, c(
      "Less than 2 women", "Don't talk to each other",
      "Only talk about men", "Dubious", "Passes Bechdel"
    )),
    binary = binary == "PASS",
    domgross = as.numeric(domgross),
    intgross = as.numeric(intgross),
    domgross_2013 = as.numeric(domgross_2013),
    intgross_2013 = as.numeric(intgross_2013),
    decade_code = case_when(
      year >= 1970 & year < 1980 ~ "1970-1979",
      year >= 1980 & year < 1990 ~ "1980-1989",
      year >= 1990 & year < 2000 ~ "1990-1999",
      year >= 2000 & year < 2010 ~ "2000-2009",
      year >= 2010 & year < 2020 ~ "2010 - present"
    ),
    decade_code = as.factor(decade_code),
    # Decode HTML entities left in the titles. The previous calls replaced
    # each character with itself (a no-op), and only the first match.
    # NOTE(review): the entity spellings below (named and numeric forms) are
    # assumed -- confirm against the raw titles. "&amp;" is decoded last so an
    # already-escaped entity is not decoded twice.
    title = str_replace_all(title, c(
      "&#39;" = "'", "&apos;" = "'",
      "&agrave;" = "à", "&#224;" = "à",
      "&aring;" = "å", "&#229;" = "å",
      "&auml;" = "ä", "&#228;" = "ä",
      "&amp;" = "&"
    )),
    runtime = as.numeric(str_replace(runtime, " min", "")),
    # TRUE when the awards text contains "Won" and names one of the three
    # awards; NA when `awards` is missing.
    award_winner = str_detect(awards, "Won") &
      str_detect(awards, "Golden Globe|Oscar|BAFTA"),
    profit = intgross_2013 - budget_2013,
    ROI = profit / budget_2013
  ) %>%
  # `genre` is a comma-separated list; split it into up to three columns.
  # fill = "right" pads shorter lists with NA without a warning.
  separate(genre, into = c("g1", "g2", "g3"), sep = ", ", fill = "right") %>%
  rename(
    pass_bechdel = binary,
    bechdel_score = clean_test,
    decade = decade_code
  ) %>%
  # One logical flag per genre, checked against all three genre slots (the
  # previous version compared g1 three times, so secondary genres were never
  # flagged). `g1 ==` keeps the flag NA when the movie has no genre at all;
  # `%in%` keeps a missing g2/g3 from turning a known FALSE into NA.
  mutate(
    action = g1 == "Action" | g2 %in% "Action" | g3 %in% "Action",
    adventure = g1 == "Adventure" | g2 %in% "Adventure" | g3 %in% "Adventure",
    animation = g1 == "Animation" | g2 %in% "Animation" | g3 %in% "Animation",
    biography = g1 == "Biography" | g2 %in% "Biography" | g3 %in% "Biography",
    comedy = g1 == "Comedy" | g2 %in% "Comedy" | g3 %in% "Comedy",
    crime = g1 == "Crime" | g2 %in% "Crime" | g3 %in% "Crime",
    documentary = g1 == "Documentary" | g2 %in% "Documentary" | g3 %in% "Documentary",
    drama = g1 == "Drama" | g2 %in% "Drama" | g3 %in% "Drama",
    family = g1 == "Family" | g2 %in% "Family" | g3 %in% "Family",
    fantasy = g1 == "Fantasy" | g2 %in% "Fantasy" | g3 %in% "Fantasy",
    history = g1 == "History" | g2 %in% "History" | g3 %in% "History",
    horror = g1 == "Horror" | g2 %in% "Horror" | g3 %in% "Horror",
    music = g1 == "Music" | g2 %in% "Music" | g3 %in% "Music",
    musical = g1 == "Musical" | g2 %in% "Musical" | g3 %in% "Musical",
    mystery = g1 == "Mystery" | g2 %in% "Mystery" | g3 %in% "Mystery",
    romance = g1 == "Romance" | g2 %in% "Romance" | g3 %in% "Romance",
    sci_fi = g1 == "Sci-Fi" | g2 %in% "Sci-Fi" | g3 %in% "Sci-Fi",
    sport = g1 == "Sport" | g2 %in% "Sport" | g3 %in% "Sport",
    thriller = g1 == "Thriller" | g2 %in% "Thriller" | g3 %in% "Thriller",
    war = g1 == "War" | g2 %in% "War" | g3 %in% "War",
    western = g1 == "Western" | g2 %in% "Western" | g3 %in% "Western"
  ) %>%
  select(year, title, bechdel_score, pass_bechdel, budget_2013:intgross_2013, decade, imdb_id, language, metascore, imdb_rating, award_winner, runtime, profit, ROI, action:western)
# Persist the cleaned data (the path has no file extension; kept as-is).
readr::write_csv(movies_df, "./movies_df")
# Show the first rows of movies_df as a scrollable HTML table.
# Both style options go in `lightable_options`; passed positionally, the
# second string ("hover") was being taken as the `html_font` argument.
head(movies_df) %>%
  kableExtra::kbl() %>%
  kableExtra::kable_paper(
    lightable_options = c("striped", "hover"),
    full_width = FALSE
  ) %>%
  kableExtra::scroll_box(width = "100%", height = "300px")
| year | title | bechdel_score | pass_bechdel | budget_2013 | domgross_2013 | intgross_2013 | decade | imdb_id | language | metascore | imdb_rating | award_winner | runtime | profit | ROI | action | adventure | animation | biography | comedy | crime | documentary | drama | family | fantasy | history | horror | music | musical | mystery | romance | sci_fi | sport | thriller | war | western |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 21 & Over | Don’t talk to each other | FALSE | 13000000 | 25682380 | 42195766 | 2010 - present | 1711425 | NA | NA | NA | NA | NA | 29195766 | 2.2458282 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 2012 | Dredd 3D | Passes Bechdel | TRUE | 45658735 | 13611086 | 41467257 | 2010 - present | 1343727 | NA | NA | NA | NA | NA | -4191478 | -0.0918001 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| 2013 | 12 Years a Slave | Don’t talk to each other | FALSE | 20000000 | 53107035 | 158607035 | 2010 - present | 2024544 | English | 97 | 8.3 | TRUE | 134 | 138607035 | 6.9303517 | FALSE | FALSE | FALSE | TRUE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE |
| 2013 | 2 Guns | Don’t talk to each other | FALSE | 61000000 | 75612460 | 132493015 | 2010 - present | 1272878 | English, Spanish | 55 | 6.8 | FALSE | 109 | 71493015 | 1.1720166 | TRUE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE |
| 2013 | 42 | Only talk about men | FALSE | 40000000 | 95020213 | 95020213 | 2010 - present | 0453562 | English | 62 | 7.6 | FALSE | 128 | 55020213 | 1.3755053 | FALSE | FALSE | FALSE | TRUE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE |
| 2013 | 47 Ronin | Only talk about men | FALSE | 225000000 | 38362475 | 145803842 | 2010 - present | 1335975 | English, Japanese | 29 | 6.6 | FALSE | 118 | -79196158 | -0.3519829 | TRUE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE | FALSE |
Our resulting dataset contains 1794 observations and 37 variables, providing basic information about each movie’s decade, genre, and runtime, whether it passes the Bechdel Test, and information about its budget, revenue, and ratings.
# Number and share of movies at each Bechdel criterion, as a styled table.
movies_df %>%
  count(bechdel_score, name = "N") %>%
  mutate(Proportion = N / sum(N)) %>%
  rename(`Bechdel Test Criterion` = bechdel_score) %>%
  knitr::kable(digits = 3) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover"))
| Bechdel Test Criterion | N | Proportion |
|---|---|---|
| Less than 2 women | 141 | 0.079 |
| Don’t talk to each other | 514 | 0.287 |
| Only talk about men | 194 | 0.108 |
| Dubious | 142 | 0.079 |
| Passes Bechdel | 803 | 0.448 |
# Stacked bar chart of movie counts per decade, split by Bechdel criterion.
# Counting once in long format replaces five filtered frames glued together
# with bind_cols(), which only lined up when every criterion occurred in every
# decade and relied on auto-repaired column names such as `n...3`.
movies_df %>%
  # .drop = FALSE keeps decade/criterion combinations with zero movies.
  count(decade, bechdel_score, .drop = FALSE) %>%
  plot_ly(
    x = ~decade, y = ~n, type = "bar",
    # One trace per criterion; colours follow the factor level order.
    color = ~bechdel_score,
    colors = c("darkred", "red", "darkorange", "yellow", "lightgreen")
  ) %>%
  layout(
    barmode = "stack",
    xaxis = list(title = "Decade"),
    yaxis = list(title = "Count")
  )
# Scatter of IMDB rating by Bechdel criterion, points coloured by rating.
movies_df %>%
  plot_ly(
    y = ~imdb_rating, x = ~bechdel_score, type = "scatter",
    mode = "markers", marker = list(color = ~imdb_rating)
  ) %>%
  layout(
    yaxis = list(
      title = list(text = "IMDB Rating", standoff = 5),
      tickfont = list(size = 10),
      gridcolor = "white"
    ),
    # tickfont/gridcolor belong inside the xaxis list; they were previously
    # passed as top-level layout arguments because of a misplaced parenthesis.
    xaxis = list(
      title = "Bechdel Criterion",
      tickfont = list(size = 10),
      gridcolor = "gray"
    )
  )
Next, we want to explore how movie budgets may differ according to the primacy of women’s roles in movies. Grouping by Bechdel score, we can compute the median budget, adjusted to 2013 inflation.
# Horizontal bar chart of the median 2013-adjusted budget per Bechdel
# criterion.
movies_df %>%
  group_by(bechdel_score) %>%
  summarise(median_budget = median(budget_2013)) %>%
  plot_ly(
    x = ~median_budget, y = ~bechdel_score, type = "bar",
    color = ~bechdel_score, colors = "YlGn"
  ) %>%
  layout(
    # standoff is a property of the axis title, so it is nested under `title`
    # (as in the IMDB rating plot) rather than set on the axis itself.
    yaxis = list(title = list(text = "Bechdel Criterion", standoff = 10)),
    xaxis = list(title = "Median movie budget ($)"),
    legend = list(reverse = TRUE)
  )
# Box plots of 2013-adjusted budget per Bechdel criterion; hovering a point
# shows the movie title.
movies_df %>%
  plot_ly(
    x = ~bechdel_score, y = ~budget_2013, type = "box", text = ~title
  ) %>%
  layout(
    # standoff is a property of the axis title, so it is nested under `title`.
    yaxis = list(title = list(text = "Movie budget ($)", standoff = 10)),
    xaxis = list(title = "Bechdel Criterion"),
    legend = list(reverse = TRUE)
  )
We can visualize the median budgets with a bar chart to see that movies featuring two women who don’t talk to each other appear to have much larger budgets than the rest. Movies that pass the Bechdel test also appear to have slightly smaller budgets than movies that don’t pass.
Next steps…
# Histogram of 2013-adjusted budgets, rendered as an interactive plotly chart.
budget_dist_p = movies_df %>%
  ggplot(aes(x = budget_2013)) +
  # bins = 30 is the ggplot2 default, set explicitly to avoid the bin message.
  geom_histogram(alpha = 0.8, color = "white", bins = 30) +
  labs(
    # Budget is on the x axis and the bin count on the y axis (the labels
    # were previously swapped).
    x = "Budget ($, 2013-adjusted)",
    y = "Count",
    title = "Distribution of movie budgets"
  )
ggplotly(budget_dist_p)
It appears that budget is heavily right-skewed. Because of this, we run a Kruskal-Wallis test in place of an F-test. Below are the results:
# Kruskal-Wallis rank sum test of 2013-adjusted budget across Bechdel
# criteria, shown as a styled table.
kruskal.test(budget_2013 ~ bechdel_score, data = movies_df) %>%
  broom::tidy() %>%
  # Format very small p-values as "<0.001" instead of letting the table
  # print them as 0.
  mutate(p.value = format.pval(p.value, digits = 3, eps = 0.001)) %>%
  rename(
    "Test statistic" = statistic,
    "p-value" = p.value,
    "Parameter (df)" = parameter,
    "Method" = method
  ) %>%
  kableExtra::kbl() %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover"),
    font_size = 12
  )
| Test statistic | p-value | Parameter (df) | Method |
|---|---|---|---|
| 57.04459 | 0 | 4 | Kruskal-Wallis rank sum test |